This notebook summarizes the findings from assigning consensus cell
type labels to all ScPCA samples. All results from the
cell-type-consensus module in OpenScPCA-nf
must be saved to results prior to rendering this
notebook.
suppressPackageStartupMessages({
# load required packages
library(ggplot2)
})
# Set default ggplot theme
theme_set(
theme_classic()
)
Functions
# function to read in project data frames with all cells in a project
# output is a summarized table with total cells per sample, total cells per annotation, and number of cell types
summarize_celltypes <- function(file, id){
# read in data
df <- readr::read_tsv(file)
# get total cell count and number of assigned cell types per library
total_cells_df <- df |>
dplyr::group_by(library_id) |>
dplyr::summarize(
total_cells_per_library = length(library_id),
num_celltypes = length(unique(consensus_annotation))
)
summary_df <- df |>
dplyr::group_by(library_id, consensus_annotation, consensus_ontology) |>
dplyr::summarize(total_cells_per_annotation = length(consensus_annotation)) |>
dplyr::left_join(total_cells_df, by = "library_id") |>
dplyr::mutate(
# add percentage
percent_cells_annotation = round((total_cells_per_annotation / total_cells_per_library) * 100, 2)
) |>
dplyr::ungroup()
return(summary_df)
}
Data setup
# The base path for the OpenScPCA repository, found by its (hidden) .git directory
repository_base <- rprojroot::find_root(rprojroot::is_git_root)
module_base <- file.path(repository_base, "analyses", "cell-type-consensus")
# results directory with cell-type-consensus
results_dir <- file.path(module_base, "results", "cell-type-consensus")
# diagnoses table used for labeling plots
diagnoses_file <- file.path(module_base, "sample-info", "project-diagnoses.tsv")
# list all results files
results_files <- list.files(results_dir, pattern = "_consensus-cell-types\\.tsv.\\gz$", full.names = TRUE)
# get project ids from file list
project_ids <- stringr::str_remove(basename(results_files), "_consensus-cell-types.tsv.gz")
names(results_files) <- project_ids
# remove cell line projects from file list
cell_line_projects <- c("SCPCP000020", "SCPCP000024")
project_ids <- setdiff(project_ids, cell_line_projects) # remove cell line projects
results_files <- results_files[project_ids]
# read in diagnoses
diagnoses_df <- readr::read_tsv(diagnoses_file)
# read in results and prep data frame for plotting
all_results_df <- results_files |>
purrr::imap(summarize_celltypes) |>
dplyr::bind_rows(.id = "project_id") |>
# add in diagnoses
dplyr::left_join(diagnoses_df, by = "project_id") |>
dplyr::mutate(
# create a label for plotting
project_label = glue::glue("{project_id}:{diagnosis}")
)
Is it all just Unknown?
The first thing we will look at is how many of the cells in each
sample are categorized as “Unknown”, which means no consensus between
SingleR and CellAssign was identified.
unknown_only <- all_results_df |>
dplyr::filter(consensus_annotation == "Unknown")
ggplot(unknown_only, aes(x = project_label, y = percent_cells_annotation)) +
ggforce::geom_sina(size = 0.1) +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
plot.margin = margin(10,10,10,10)) +
labs(
x = "",
y = "Percent of cells annotated as Unknown"
)

NA
It looks like we do have some samples that aren’t just all “Unknown”!
It definitely varies by project, but for most projects we at least see
some proportion of samples with assigned cell types.
Let’s look at how many samples actually have some cells outside of
unknown identified. To do this, we will identify all libraries that only
have cells called as “Unknown”.
high_tumor_df <- unknown_only |>
dplyr::mutate(no_cells_identified = percent_cells_annotation == 100) |>
dplyr::group_by(project_label) |>
dplyr::summarize(all_unknown = sum(no_cells_identified),
classified_cells = sum(!no_cells_identified),
percentage_unknown = round(all_unknown/(all_unknown + classified_cells)*100, 2),
# add number of libraries for plotting
total_libraries = length(library_id)) |>
# set order for plots
dplyr::mutate(project_label = forcats::fct_reorder(project_label, total_libraries, .desc = TRUE))
Which projects have the highest proportion of samples with all
“Unknown”?
# table with percentage of samples
high_tumor_df |>
dplyr::select(project_label, percentage_unknown) |>
dplyr::arrange(desc(percentage_unknown))
NA
It looks like all projects do have cell types identified that are not
“Unknown”. However, SCPCP000011 (retinoblastoma), has a
fairly high percentage of samples without any consensus labels.
Number of cell types observed
Below we look at the number of cell types observed in each project
for all samples. This does not include cells labeled as “Unknown”.
num_celltypes_df <- all_results_df |>
# add a new line for facet labels
dplyr::mutate(facet_label = glue::glue("{project_id}\n{diagnosis}")) |>
# remove unknown as a cell type
dplyr::filter(consensus_annotation != "Unknown") |>
dplyr::select(facet_label, library_id, num_celltypes) |>
unique()
ggplot(num_celltypes_df, aes(x = num_celltypes)) +
geom_histogram(binwidth = 1, center = 0) +
facet_wrap(vars(facet_label),
ncol = 3) +
labs(
x = "Number of cell types"
) +
theme_bw()

Distribution of consensus cell types
Now we look at the distribution of the cell types in each sample. For
these plots, we will pull out the top 9 cell types for each project. All
other cells will be labeled with “All remaining cell types”.
The top cell types are determined by counting how many libraries each
cell type is found in within a project and taking the most frequent
types.
plot_df <- all_results_df |>
dplyr::group_by(project_id) |>
dplyr::mutate(
# get most frequently observed cell types across libraries in that project
top_celltypes = forcats::fct_lump_n(consensus_annotation, 9, other_level = "All remaining cell types", ties.method = "first") |>
# sort by frequency
forcats::fct_infreq() |>
# make sure all remaining and unknown are last, use this to assign colors in specific order
forcats::fct_relevel("All remaining cell types", "Unknown", after = Inf)
)
Warning: There was 1 warning in `dplyr::mutate()`.
ℹ In argument: `top_celltypes = forcats::fct_relevel(...)`.
ℹ In group 19: `project_id = "SCPCP000021"`.
Caused by warning:
! 1 unknown level in `f`: All remaining cell types
# get all unique cell types ordered by frequency
unique_celltypes <- plot_df |>
dplyr::filter(!top_celltypes %in% c("All remaining cell types", "Unknown")) |>
dplyr::pull(top_celltypes) |>
unique() |>
sort() |>
as.character()
# get color palette
colors <- c(
palette.colors(palette = "alphabet"),
"black", # 1 extra since alphabet is 26 and we have 27, this will be plasma cell which shows up once
"grey60",
"grey95"
)
names(colors) <- c(unique_celltypes, "All remaining cell types", "Unknown")
project_labels <- unique(all_results_df$project_label)
# stacked bar chart showing the distribution of the top 9 cell types for each project, including Unknown
project_labels |>
purrr::map(\(label){
project_df <- plot_df |>
dplyr::filter(project_label == label) |>
dplyr::mutate(
# relevel factors for specific project
top_celltypes = forcats::fct_infreq(top_celltypes) |>
forcats::fct_relevel("All remaining cell types", "Unknown", after = Inf)
)
# make a stacked bar chart with top cell types
ggplot(project_df) +
aes(
x = library_id,
y = percent_cells_annotation,
fill = top_celltypes
) +
geom_col() +
scale_y_continuous(expand = c(0,0)) +
scale_fill_manual(values = colors, name = "cell type") +
ggtitle(label) +
theme(axis.text.x = element_blank())
}) |>
patchwork::wrap_plots(ncol = 1)

This looks really promising! A few observations:
- Cell types identified tend to line up with expectations for the type
of tumor. For example, leukemia libraries have T and B cells, brain
tumors have macrophages, and solid tumors have fibroblasts and muscle
cells.
- Projects that I would expect to be more difficult to classify
(sarcomas, wilms, RB) have fewer cells classified then things like brain
and leukemia. Notably many of the solid tumor projects (4, 5, 12-16, and
23) have a handful of PDX samples where I would expect to see fewer
normal cells.
Most frequently observed cell types
The last thing we will do is look at the most frequently observed
cell types across all samples. The below table is ordered by the number
of libraries the cell type is observed.
all_results_df |>
dplyr::filter(consensus_annotation != "Unknown") |>
dplyr::group_by(consensus_annotation) |>
dplyr::summarize(
total_libraries = dplyr::n(),
min_percentage = min(percent_cells_annotation),
mean_percentage = round(mean(percent_cells_annotation), 2),
median_percentage = median(percent_cells_annotation),
max_percentage = max(percent_cells_annotation)
) |>
dplyr::arrange(desc(total_libraries))
NA
Session info
# record the versions of the packages used in this analysis and other environment information
sessionInfo()
---
title: "Explore consensus cell types"
author: Ally Hawkins
date: "`r Sys.Date()`"
output:
  html_notebook:
    toc: true
    toc_depth: 3
    code_folding: show
---

This notebook summarizes the findings from assigning consensus cell type labels to all ScPCA samples. 
All results from the `cell-type-consensus` module in `OpenScPCA-nf` must be saved to `results` prior to rendering this notebook. 

```{r packages}
suppressPackageStartupMessages({
  # load required packages
  library(ggplot2)
})

# Set default ggplot theme
theme_set(
  theme_classic()
)
```

## Functions 

```{r}
# function to read in project data frames with all cells in a project
# output is a summarized table with total cells per sample, total cells per annotation, and number of cell types 
summarize_celltypes <- function(file, id){
  
  # read in data
  df <- readr::read_tsv(file) 
  
  # get total cell count and number of assigned cell types per library
  total_cells_df <- df |> 
    dplyr::group_by(library_id) |> 
    dplyr::summarize(
      total_cells_per_library = length(library_id),
      num_celltypes = length(unique(consensus_annotation))
    )
  
  summary_df <- df |> 
    dplyr::group_by(library_id, consensus_annotation, consensus_ontology) |> 
    dplyr::summarize(total_cells_per_annotation = length(consensus_annotation)) |>
    dplyr::left_join(total_cells_df, by = "library_id") |> 
    dplyr::mutate(
      # add percentage 
      percent_cells_annotation = round((total_cells_per_annotation / total_cells_per_library) * 100, 2)
    ) |> 
    dplyr::ungroup()
  
  return(summary_df)
  
}
```

## Data setup


```{r base paths}
# The base path for the OpenScPCA repository, found by its (hidden) .git directory
repository_base <- rprojroot::find_root(rprojroot::is_git_root)
module_base <- file.path(repository_base, "analyses", "cell-type-consensus")

# results directory with cell-type-consensus 
results_dir <- file.path(module_base, "results", "cell-type-consensus")

# diagnoses table used for labeling plots 
diagnoses_file <- file.path(module_base, "sample-info", "project-diagnoses.tsv")
```

```{r}
# list all results files 
results_files <- list.files(results_dir, pattern = "_consensus-cell-types\\.tsv.\\gz$", full.names = TRUE)

# get project ids from file list  
project_ids <- stringr::str_remove(basename(results_files), "_consensus-cell-types.tsv.gz")
names(results_files) <- project_ids

# remove cell line projects from file list
cell_line_projects <- c("SCPCP000020", "SCPCP000024")
project_ids <- setdiff(project_ids, cell_line_projects) # remove cell line projects
results_files <- results_files[project_ids]
```


```{r, message=FALSE}
# read in diagnoses
diagnoses_df <- readr::read_tsv(diagnoses_file)


# read in results and prep data frame for plotting 
all_results_df <- results_files |> 
  purrr::imap(summarize_celltypes) |> 
  dplyr::bind_rows(.id = "project_id") |> 
  # add in diagnoses 
  dplyr::left_join(diagnoses_df, by = "project_id") |> 
  dplyr::mutate(
    # create a label for plotting
    project_label = glue::glue("{project_id}:{diagnosis}")
  )

```

## Is it all just Unknown?

The first thing we will look at is how many of the cells in each sample are categorized as "Unknown", which means no consensus between `SingleR` and `CellAssign` was identified. 

```{r, fig.height=7}
unknown_only <- all_results_df |> 
  dplyr::filter(consensus_annotation == "Unknown")

ggplot(unknown_only, aes(x = project_label, y = percent_cells_annotation)) +
  ggforce::geom_sina(size = 0.1) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
        plot.margin = margin(10,10,10,10)) +
  labs(
    x = "", 
    y = "Percent of cells annotated as Unknown"
  )
  
```

It looks like we do have some samples that aren't just all "Unknown"!
It definitely varies by project, but for most projects we at least see some proportion of samples with assigned cell types. 

Let's look at how many samples actually have some cells outside of unknown identified. 
To do this, we will identify all libraries that only have cells called as "Unknown". 

```{r}
high_tumor_df <- unknown_only |> 
  dplyr::mutate(no_cells_identified = percent_cells_annotation == 100) |> 
  dplyr::group_by(project_label) |> 
  dplyr::summarize(all_unknown = sum(no_cells_identified),
                   classified_cells = sum(!no_cells_identified),
                   percentage_unknown = round(all_unknown/(all_unknown + classified_cells)*100, 2),
                   # add number of libraries for plotting 
                   total_libraries = length(library_id)) |>
  # set order for plots 
  dplyr::mutate(project_label = forcats::fct_reorder(project_label, total_libraries, .desc = TRUE))
```


Which projects have the highest proportion of samples with all "Unknown"? 

```{r}
# table with percentage of samples 
high_tumor_df |> 
  dplyr::select(project_label, percentage_unknown) |> 
  dplyr::arrange(desc(percentage_unknown))

```

It looks like all projects do have cell types identified that are not "Unknown". 
However, `SCPCP000011` (retinoblastoma), has a fairly high percentage of samples without any consensus labels. 

## Number of cell types observed

Below we look at the number of cell types observed in each project for all samples. 
This does not include cells labeled as "Unknown". 

```{r, fig.height=10}
num_celltypes_df <- all_results_df |> 
  # add a new line for facet labels 
  dplyr::mutate(facet_label = glue::glue("{project_id}\n{diagnosis}")) |>
  # remove unknown as a cell type 
  dplyr::filter(consensus_annotation != "Unknown") |> 
  dplyr::select(facet_label, library_id, num_celltypes) |> 
  unique()

ggplot(num_celltypes_df, aes(x = num_celltypes)) +
  geom_histogram(binwidth = 1, center = 0) +
  facet_wrap(vars(facet_label), 
             ncol = 3) +
  labs(
    x = "Number of cell types"
  ) +
  theme_bw()
```

## Distribution of consensus cell types 

Now we look at the distribution of the cell types in each sample. 
For these plots, we will pull out the top 9 cell types for each project. 
All other cells will be labeled with "All remaining cell types". 

The top cell types are determined by counting how many libraries each cell type is found in within a project and taking the most frequent types. 

```{r}
plot_df <- all_results_df |> 
    dplyr::group_by(project_id) |> 
    dplyr::mutate(
      # get most frequently observed cell types across libraries in that project 
      top_celltypes = forcats::fct_lump_n(consensus_annotation, 9, other_level = "All remaining cell types", ties.method = "first") |> 
        # sort by frequency 
        forcats::fct_infreq() |> 
        # make sure all remaining and unknown are last, use this to assign colors in specific order
        forcats::fct_relevel("All remaining cell types", "Unknown", after = Inf)
    )

# get all unique cell types ordered by frequency 
unique_celltypes <- plot_df |> 
  dplyr::filter(!top_celltypes %in% c("All remaining cell types", "Unknown")) |> 
  dplyr::pull(top_celltypes) |> 
  unique() |>
  sort() |> 
  as.character()

# get color palette
colors <- c(
  palette.colors(palette = "alphabet"),
  "black", # 1 extra since alphabet is 26 and we have 27, this will be plasma cell which shows up once 
  "grey60", 
  "grey95"
)
names(colors) <- c(unique_celltypes, "All remaining cell types", "Unknown")
```


```{r, fig.height=60, fig.width=10}
project_labels <- unique(all_results_df$project_label)

# stacked bar chart showing the distribution of the top 9 cell types for each project, including Unknown
project_labels |> 
  purrr::map(\(label){
    
    project_df <- plot_df |> 
      dplyr::filter(project_label == label) |> 
      dplyr::mutate(
        # relevel factors for specific project 
        top_celltypes = forcats::fct_infreq(top_celltypes) |> 
          forcats::fct_relevel("All remaining cell types", "Unknown", after = Inf)
      )
    
    # make a stacked bar chart with top cell types 
    ggplot(project_df) + 
      aes(
        x = library_id, 
        y = percent_cells_annotation, 
        fill = top_celltypes
      ) +
      geom_col() + 
      scale_y_continuous(expand = c(0,0)) +
      scale_fill_manual(values = colors, name = "cell type") +
      ggtitle(label) +
      theme(axis.text.x = element_blank())
  
    }) |>
  patchwork::wrap_plots(ncol = 1)
```


This looks really promising!
A few observations: 

- Cell types identified tend to line up with expectations for the type of tumor. 
For example, leukemia libraries have T and B cells, brain tumors have macrophages, and solid tumors have fibroblasts and muscle cells. 
- Projects that I would expect to be more difficult to classify (sarcomas, wilms, RB) have fewer cells classified then things like brain and leukemia. 
Notably many of the solid tumor projects (4, 5, 12-16, and 23) have a handful of PDX samples where I would expect to see fewer normal cells. 

## Most frequently observed cell types 

The last thing we will do is look at the most frequently observed cell types across all samples. 
The below table is ordered by the number of libraries the cell type is observed. 

```{r}
all_results_df |> 
  dplyr::filter(consensus_annotation != "Unknown") |> 
  dplyr::group_by(consensus_annotation) |> 
  dplyr::summarize(
    total_libraries = dplyr::n(),
    min_percentage = min(percent_cells_annotation),
    mean_percentage = round(mean(percent_cells_annotation), 2),
    median_percentage = median(percent_cells_annotation),
    max_percentage = max(percent_cells_annotation)
  ) |> 
  dplyr::arrange(desc(total_libraries))
  
```


## Session info 

```{r session info}
# record the versions of the packages used in this analysis and other environment information
sessionInfo()
```

